'''


exec(''.join(open("/workERS/daltonm/corona/ppp/codefiles/CRppppt1.py", encoding="utf8").readlines()[:]))
nohup python3 /workERS/daltonm/corona/ppp/codefiles/CRppppt1.py  | tee &
'''

Dppp = 0

import pandas as pd
import numpy as np
import matplotlib as mpl

filename = 'CRppppt1'

dataloc=  "/dataERS/eract/daltonm/"
resultsloc="/workERS/daltonm/corona/ppp/results/"
ANfile="CRppppt1"

import os
import sys
sys.path.append('/daltonm/corona')
#from basicfunctions import *
from basicfunctions import *

resultsloc1 = "/daltonm/corona/"

from datetime import date
datestr = date.today().strftime(format="%Y%m%d")
resultsloc = resultsloc1 + datestr + '/'
if not os.path.exists(resultsloc):
    os.makedirs(resultsloc)


logging.basicConfig(filename=resultsloc + filename + '.txt', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')
logging.exception('And this, too')
logging.captureWarnings(True)



'''
coming from geocoding_user_guide setn by wen luo
'''
from zipfile import ZipFile
#import zipfile
import os
from os.path import basename

geocodechars = [20, 35, 35, 30, 2, 5, 4, 2]
geocodenames = ['ID', 'addr1', 'addr2', 'city', 'abbr', 'zip5', 'zip4', 'end']


def ldbaddressout(dft, addrvars, geocodechars=geocodechars, geocodenames=geocodenames):
    addr1, addr2, city, abbr, zip5, zip4 = addrvars
    dft['addr1'] = dft[addr1].fillna('')
    dft['addr2'] = dft[addr2].fillna('')
    dft['city'] = dft[city].fillna('')
    dft['abbr'] = dft[abbr].fillna('')
    dft['zip5'] = dft[zip5].fillna('')
    dft['zip4'] = dft[zip4].fillna('')
    dft['end'] = ' \n'
    for i, j in enumerate(geocodenames):
        width = geocodechars[i]
        dft[j] = dft[j].fillna('')
        dft[j] = dft[j].apply(lambda x: str(x) + (width - len(str(x))) * ' ')
    dft['string'] = dft[geocodenames].sum(axis=1)
    return (''.join(dft['string'].values))


if Dppp==1:
    # create a ZipFile object
    dfp = []
    fnames = ['public_up_to_150k_'+str(i)+'_220703' for i in range(1,13)] + ['public_150k_plus_220703']
    for file in fnames:
        dfp1 = pd.read_csv(dataloc + 'pppfiles/' + file + '.csv', encoding="ISO-8859-1", error_bad_lines=False).drop_duplicates()
        dfp1['Name of File'] = file
        dfp.append(dfp1)



    # fnames = ['FY2020Q1-P12_All_FA_AccountBreakdownByAward_2022-05-16_H01M20S29192467.zip',
    #           'FY2021P01-P12_All_FA_AccountBreakdownByAward_2022-05-15_H15M41S49402076.zip',
    #           'FY2022P01-P06_All_FA_AccountBreakdownByAward_2022-05-15_H15M29S41730876.zip']
    # for fname in fnames:
    #     with ZipFile(dataloc + 'pppfiles/' + fname, 'r') as tzip:
    #         # printing all the contents of the zip file
    #         print(tzip.printdir())
    # file = ZipFile(dataloc + 'pppfiles/' + fnames[0])
    # df = pd.read_csv(file.open("FY2020Q1-P12_All_FA_AccountBreakdownByAward_2022-05-16_H01M20S45_1.csv"), encoding='latin1')
    # data_df = pd.read_csv(file.open("Production_Crops_E_All_Data.csv"), encoding='latin1')


    dfp = pd.concat(dfp, ignore_index=True)

    '''
    ['ACTIONDATE', 'ACTIONTYPE', 'ASSISTANCETYPE', 'AWARDDESC',
           'AWARDEEORRECIPIENTLEGALENTITYNAME',
           'AWARDEEORRECIPIENTLEGALENTITYNAMEANDDOINGBUSINESSAS',
           'AWARDEEORRECIPIENTUNIQUEIDENTIFIER', 'AWARDINGAGENCYCD',
           'AWARDINGOFFICECD', 'AWARDINGSUBTIERAGENCYCD',
           'AWARDMODIFICATIONAMENDMENTNUM', 'Address', 'BUSINESSFUNDSINDICATOR',
           'BUSINESSTYPES', 'BusinessName', 'BusinessType', 'CD', 'CFDA_NUM',
           'CORRECTIONLATEDELETEIND', 'City', 'DateApproved',
           'FACEVALUEOFDIRECTLOANORLOANGUARANTEE', 'FAIN',
           'FEDERALACTIONOBLIGATION', 'FISCALYEARANDQTRCORRECTION',
           'FUNDINGAGENCYCD', 'FUNDINGOFFICECD', 'FUNDINGSUBTIERAGENCYCD',
           'Gender', 'JobsReported', 'LEGALENTITYADDRLINE1',
           'LEGALENTITYADDRLINE2', 'LEGALENTITYADDRLINE3', 'LEGALENTITYCITYNAME',
           'LEGALENTITYCONGRESSIONALDISTRICT', 'LEGALENTITYCOUNTRYCD',
           'LEGALENTITYFOREIGNCITYNAME', 'LEGALENTITYFOREIGNPOSTALCD',
           'LEGALENTITYFOREIGNPROVINCENAME', 'LEGALENTITYSTATECD',
           'LEGALENTITYZIP5', 'LEGALENTITYZIPLAST4', 'Lender', 'LoanAmount',
           'NAICSCode', 'NONFEDERALFUNDINGAMOUNT', 'Name of File', 'NonProfit',
           'ORIGINALLOANSUBSIDYCOST', 'PERIODOFPERFORMANCECURRENTENDDATE',
           'PERIODOFPERFORMANCESTARTDATE', 'PRIMPLACEOFPERFORMANCECD',
           'PRIMPLACEOFPERFORMANCECONGRESSIONALDISTRICT',
           'PRIMPLACEOFPERFORMANCECOUNTRYCD',
           'PRIMPLACEOFPERFORMANCEFOREIGNLOCATIONDESC',
           'PRIMPLACEOFPERFORMANCEZIP+4', 'RECORDTYPE', 'RaceEthnicity', 'SAI_NUM',
           'State', 'URI', 'Unnamed: 0', 'Veteran', 'Zip']
    ppp cols
    ['Address', 'BusinessName', 'BusinessType', 'CD', 'City',
     'DateApproved', 'Gender', 'JobsReported', 'Lender', 'LoanAmount', 
     'NAICSCode', 'Name of File', 'NonProfit', 'RaceEthnicity', 'State', 'Veteran', 'Zip']
    
    as of 5/15/22
    ['ID',
     'DateApproved',
     'SBAOfficeCode',
     'ProcessingMethod',
     'BorrowerName',
     'BorrowerAddress',
     'BorrowerCity',
     'BorrowerState',
     'BorrowerZip',
     'LoanStatusDate',
     'LoanStatus',
     'Term',
     'SBAGuarantyPercentage',
     'InitialApprovalAmount',
     'CurrentApprovalAmount',
     'UndisbursedAmount',
     'ServicingLenderLocationID',
     'ServicingLenderName',
     'ServicingLenderAddress',
     'ServicingLenderCity',
     'ServicingLenderState',
     'ServicingLenderZip',
     'RuralUrbanIndicator',
     'HubzoneIndicator',
     'LMIIndicator',
     'BusinessAgeDescription',
     'ProjectCity',
     'ProjectCountyName',
     'ProjectState',
     'ProjectZip',
     'CD',
     'JobsReported',
     'NAICSCode',
     'Race',
     'Ethnicity',
     'PAYROLL_PROCEED',
     'BusinessType',
     'OriginatingLenderLocationID',
     'OriginatingLender',
     'OriginatingLenderCity',
     'OriginatingLenderState',
     'Gender',
     'Veteran',
     'ForgivenessAmount',
     'ForgivenessDate',
     'Name of File',
     'st',
     'State']
    
    
    '''

    dfp['st'] = dfp['BorrowerState'].copy()
    dfp['State'] = dfp['st'].apply(lambda x: stateinfo(x, 'name'))

    dfp.rename(columns = {'LoanNumber' : 'ID'}, inplace=True)


    t = dfp.isna().sum() /1e6
    #t1 = dfp[~pppcond].isna().sum() /1e6
    #columns with over 5 million missing
    pppcols = [k for k,v in t.to_dict().items() if v<5]
    #eidlcols = [k for k,v in t1.to_dict().items() if v<9]

    #ppp data
    dfp[pppcols].to_csv(dataloc + 'pppfiles/all_ppp.psv', sep = '|')
    #eidl data
    #savesplit(dfp[~pppcond][eidlcols], 'all_eidl')


    dfp['zip4'] = dfp['BorrowerZip'].apply(lambda x: str(x)[-5:])
    cond = (dfp['zip4'].str.startswith('-'))
    dfp['zip4'] = np.where(cond,dfp['zip4'].str[1:], '' )
    dfp['zip5'] = dfp['BorrowerZip'].astype('str').str[:5]
    dfp['blank'] = ''
    addrvars = ['BorrowerAddress', 'blank', 'BorrowerCity', 'st', 'zip5', 'zip4']
    textstring = ldbaddressout(dfp, addrvars)
    with open(dataloc + 'ppp.txt', 'w') as text_file:
        print(textstring, file=text_file)



    '''
    ************************************************************************
    ************************************************************************
    ************************************************************************
    eidl
    ************************************************************************
    ************************************************************************
    ************************************************************************
    '''

    fnames = ['FY2022P01-P06_All_FA_AccountBreakdownByAward_2022-05-15_H15M29S41730876',
              'FY2021P01-P12_All_FA_AccountBreakdownByAward_2022-05-15_H15M41S49402076',
              'FY2020Q1-P12_All_FA_AccountBreakdownByAward_2022-05-16_H01M20S29192467'
              ]
    dfp = []
    for file in fnames:
        with zipfile.ZipFile(dataloc + 'pppfiles/'+file + '.zip') as z:
            for cont in [i for i in z.namelist() if i.endswith('csv')]:
                with z.open(cont) as f:
                    dfp1 = pd.read_csv(f, encoding="ISO-8859-1", error_bad_lines=False).drop_duplicates()
                    #keep only if not PPP
                    kcond = (dfp1['cfda_title'] != 'PAYCHECK PROTECTION LOAN PROGRAM (PPP)')
                    dfp1 = dfp1[kcond]
                    dfp1['Name of File'] = file
                    dfp.append(dfp1)
                    #keep only disbursed loans
    dfp = pd.concat(dfp)
    #upload text file to
    # \\filer6\dbes\geocoding\ftp\erpds\Dalton_M

    '''
    doing the same but for eidl data
    '''


    addrvars = ['LEGALENTITYADDRLINE1', 'LEGALENTITYADDRLINE2', 'LEGALENTITYCITYNAME', 'LEGALENTITYSTATECD', 'LEGALENTITYZIP5', 'LEGALENTITYZIPLAST4']
    textstring = ldbaddressout(dfp[~pppcond], addrvars)
    with open(dataloc + 'eidl.txt', 'w') as text_file:
        print(textstring, file=text_file)




'''
ldb secondary addresses
'''
dataloc1="/workERS/daltonm/BG/"
keepcols1 = ['ldb_num',  'physical_addr_line_1',
       'physical_addr_line_2', 'physical_city', 'physical_state',
       'physical_zip', 'physical_zip_exp', 'ui_addr_line_1',
       'ui_addr_line_2', 'ui_city', 'ui_state', 'ui_zip', 'ui_zip_exp',
       ]

states="ak,al,ar,az,ca,co,ct,dc,de,fl,ga,hi,ia,id,il,in,ks,ky,la,ma,md,me,mi,mn," \
           "mo,ms,mt,nc,nd,ne,nh,nj,nm,nv,ny,oh,ok,or,pa,ri,sc,sd,tn,tx,ut,va,vt,wa,wi,wv,wy,pr,vi".split(',')


temptext = open(dataloc1+'ldbtext.txt').read()

#yearlist=list(range(2006,2019))
yearlist=[2019, 2020, 2021]

'''
coming from geocoding_user_guide setn by wen luo
'''




def ldbappend3(st, yr,fcolspecs,cols,dataloc,keepcols1):
    onlyfiles = [dataloc + st + '/' + f for f in listdir(dataloc + st) if isfile(join(dataloc + st, f))]
    txtphys = ''
    txt = ''
    txtoth = ''
    for i in onlyfiles:
        if "udb" + st + ".xtrt" + yr + "4.f875.gz" in i:
            dft2 = pd.read_fwf(i, compression='gzip',
                               error_bad_lines=False,
                               colspecs=fcolspecs, names=cols)
            #i believe this gets rid of master records
            dft2 = dft2[dft2['meei'] != 9]
            #this is to identify unique UI addresses
            ### already filled in ui address for physical address where physical is missing, so want to removet hese ons
            condphys = (dft2['physical_addr_line_1'].notnull())
            condui = (dft2['physical_addr_line_1'] != dft2['ui_addr_line_1']) & (dft2['ui_addr_line_1'].notnull()) #& (dft2['physical_addr_line_1'].notnull())
            #this identifies unique other addresses
            condoth = (dft2['physical_addr_line_1'] != dft2['other_addr_line_1']) & (dft2['other_addr_line_1'].notnull())
            dft2['ID'] = dft2['ldb_num'].apply(lambda x: str(year) + str(int(x)))
            addrvarsphys = ['physical_addr_line_1', 'physical_addr_line_2', 'physical_city', 'physical_state', 'physical_zip', 'physical_zip_exp']
            addrvarsui = ['ui_addr_line_1', 'ui_addr_line_2', 'ui_city', 'ui_state', 'ui_zip', 'ui_zip_exp']
            addrvarsoth = ['other_addr_line_1', 'other_addr_line_2', 'other_city', 'other_state', 'other_zip', 'other_zip_exp']
            txtphys = txtphys + ldbaddressout(dft2[condphys], addrvarsphys)
            txt = txt + ldbaddressout(dft2[condui], addrvarsui)
            txtoth = txtoth + ldbaddressout(dft2[condoth], addrvarsoth)
    return ([txtphys, txt, txtoth])

try:
    dataloc2 = "/dataERS/eract/daltonm/"
    for yearnum in reversed(yearlist):
        textstring = ''
        year = str(yearnum)
        yr = str(year[-2:])
        dataloc = "/ldb" + year + "/LDB" + yr + "4/"
        dft1 = pd.read_csv(io.StringIO(temptext), header=0, sep=" ")
        dft1.sort_values('start', inplace=True)
        dft1['start'] = dft1['start'] - 1
        dft1['end'] = dft1['start'] + dft1['number']
        # temp test
        dft1['test1'] = dft1['start'].shift(-1)
        dft1['test'] = dft1['start'] + dft1['number']
        dft1[dft1['test'] != dft1['test1']]
        fcolspecs = dft1[['start', 'end']].apply(tuple, axis=1).tolist()
        # fcolspecs=[dft1['start'],dft1['start']+dft1['number']].tolist()
        fwidths = dft1['number'].tolist()
        cols = dft1['name'].tolist()
        if yearnum in [2007] + list(range(2015, 2029)):
            procs = 20
            p = mp.Pool(procs)
            ldbappendpart = partial(ldbappend3, yr=yr, fcolspecs=fcolspecs, cols=cols, dataloc=dataloc,
                                    keepcols1=keepcols1)
            results = p.map(ldbappendpart, states)
            p.close()
            p.join()
            textstringphys = ''.join([i[0] for i in results])
            textstring = ''.join([i[1] for i in results])
            textstringoth = ''.join([i[2] for i in results])
        # with open(dataloc2 + 'ph' + yr +'.txt','w') as text_file:
        #     print(textstringphys, file=text_file)
        with open(dataloc2 + 'ui' + yr +'.txt','w') as text_file:
            print(textstring, file=text_file)
        # with open(dataloc2 + 'oth' + yr +'.txt','w') as text_file:
        #     print(textstringoth, file=text_file)
        logging.debug(year)

except Exception:
    logging.exception('fatal error')
